emanuela.damieto@studenti.unitn.it
This section and the next are relevant for reproducibility purposes. For results, please skip to section 3 (Quality Control)
suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(magrittr)
library(parallel)
library(plotly)
library(pvclust)
library(tidyverse)
library(tximport)
library(vsn)
library(emoji)
})source(here("UPSCb-common/src/R/featureSelection.R"))hpal <- colorRampPalette(c("blue","white","red"))(100)samples <- read_csv(here("data/drought_roots.csv"),
col_types=cols(.default=col_factor()))
samples <- samples[1:28,]tx2gene <- suppressMessages(read_delim(here("reference/annotation/tx2gene.tsv.gz"), delim="\t", col_names=c("TXID","GENE"), skip=1))filelist <- list.files(here("results/Salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)stopifnot(all(match(sub("[1-3]_151118_BC852HANXX_P2503_", "", sub("*_sortmerna_trimmomatic","",basename(dirname(filelist)))),
samples$SciLifeID) == 1:nrow(samples)))
samples_rep <-rbind(samples,samples)
samples_rep <- rbind(samples_rep,samples)samples_rep %<>% mutate(Filenames = filelist)write_tsv(samples_rep,here("data/samples_full_rank.txt"))txi <- suppressMessages(tximport(files = samples_rep$Filenames,
type = "salmon",
tx2gene=tx2gene))
counts <- txi$counts
colnames(counts) <- sub("*_151118_BC852HANXX_P2503_", "_", sub("*_sortmerna_trimmomatic","",basename(dirname(samples_rep$Filenames))))
samples_rep$Filenames <- sub("*_151118_BC852HANXX_P2503_", "_", sub("*_sortmerna_trimmomatic","",basename(dirname(samples_rep$Filenames))))
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))## [1] "8.1% percent (4306) of 53142 genes are not expressed"
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples_rep)
ggplot(dat,aes(x,y,fill=Level)) +
geom_col() +
scale_y_continuous(name="reads") +
facet_grid(~ factor(Level), scales = "free") +
theme_bw() +
theme(axis.text.x=element_text(angle=90,size=4), axis.title.x=element_blank()) +
labs(title ="Sample sequencing depth")👉 We observe difference in the raw sequencing depth
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) +
geom_density(na.rm = TRUE) +
ggtitle("Gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)") +
theme_bw()👉 The cumulative gene coverage is as expected since the highest peak is around 2
dat <- as.data.frame(log10(counts)) %>%
utils::stack() %>%
mutate(Level=samples_rep$Level[match(ind,samples_rep$Filenames)])
ggplot(dat,aes(x=values,group=ind,col=Level)) +
geom_density(na.rm = TRUE) +
ggtitle("Sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)") +
theme_bw()👉 All samples have the same sequencing depth characteristics and there is no deviation when we look at one or the other variable
dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data.csv"))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample condition and replicate.
dds <- DESeqDataSetFromTximport(
txi=txi,
colData = samples_rep,
design = ~ Level)## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using counts and average transcript lengths from tximport
save(dds,file=here("data/analysis/salmon/dds.rda"))(i.e. the sequencing library size effect)
dds <- estimateSizeFactors(dds) %>%
suppressMessages()
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y")
abline(h=1, col = "Red", lty = 3)and without outliers:
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y", outline=FALSE)
abline(h=1, col = "Red", lty = 3)👉 The sequencing libraries size factor is highly variable across samples but similar for the technical replicates
Let’s look at standard deviations before and after VST normalization. This plot is to see whether there is a dependency of SD on the mean.
Before:
sd_mean <- meanSdPlot(log1p(counts(dds))[rowSums(counts(dds))>0,])After VST normalization, the red line should be almost horizontal which indicates no dependency of variance on mean (homoscedastic).
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
sd_mean_vst <- meanSdPlot(vst[rowSums(vst)>0,])👉 We can conclude that the variance stabilization worked adequately even if the red line is not perfectly horizontal
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)We define the number of variables of the model (=1) and the number of possible combinations (=8).
We plot the percentage explained by different components
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)+
ggtitle("Percentage of variance explained by different components")pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
as.data.frame(colData(dds)))
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Level,shape=Level,text=Filenames)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 8. Consider
## specifying shapes manually if you must have them.
👉 Some conditions clusterized better than others in the PCA plot
Number of genes expressed per condition at different cutoffs:
#conds <- factor(paste(dds$Level,dds$Filenames))
conds <- factor(dds$Level)
dev.null <- rangeSamplesSummary(counts=vst,
conditions=conds,
nrep=3)Filter for noise
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3) %>%
suppressWarnings()vst.cutoff <- 2
abline(h=10000, col="Red", lty=3)hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)plot(as.hclust(hm$colDendrogram),xlab="",sub="", cex.axis=2)Set the cut off to 6 in order to retrieve less than 10 000 genes
vst.cutoff <- 6
hm_reduced <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)plot(as.hclust(hm_reduced$colDendrogram),xlab="",sub="", cex.axis=2)👉 The different conditions are not so different in gene expression level
Done to assess the previous dendrogram’s reproducibility
hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[vst.cutoff+1]],]))),
method.hclust = "ward.D2",
nboot = 1000, parallel = TRUE)## Creating a temporary cluster...done:
## socket cluster with 15 nodes on host 'localhost'
## Multiscale bootstrap... Done.
Plot the clustering with bp and au
plot(hm.pvclust, labels = conds, cex.axis=1.5)
pvrect(hm.pvclust)👉 Some conditions clusterize better than others
print(hm.pvclust, digits=3)##
## Cluster method: ward.D2
## Distance : correlation
##
## Estimates on edges:
##
## si au bp se.si se.au se.bp v c pchi
## 1 0.012 0.560 0.451 0.037 0.030 0.005 -0.013 0.137 0.795
## 2 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 3 0.816 0.924 0.838 0.024 0.013 0.004 -1.210 0.223 0.858
## 4 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 5 0.499 0.789 0.658 0.037 0.023 0.005 -0.605 0.197 0.954
## 6 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 7 0.720 0.878 0.799 0.030 0.017 0.004 -1.003 0.164 0.858
## 8 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 9 0.676 0.860 0.770 0.032 0.019 0.004 -0.910 0.171 0.849
## 10 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 11 0.808 0.914 0.868 0.026 0.015 0.004 -1.240 0.123 0.216
## 12 0.643 0.856 0.716 0.033 0.018 0.005 -0.818 0.246 0.337
## 13 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 14 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 15 0.951 0.979 0.950 0.012 0.006 0.002 -1.841 0.199 0.475
## 16 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 17 0.055 0.492 0.566 0.032 0.031 0.005 -0.074 -0.093 0.006
## 18 0.562 0.788 0.764 0.036 0.024 0.004 -0.759 0.041 0.710
## 19 0.563 0.797 0.743 0.036 0.023 0.005 -0.741 0.089 0.969
## 20 0.800 0.914 0.846 0.026 0.014 0.004 -1.193 0.171 0.154
## 21 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 22 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 23 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 24 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 25 0.314 0.683 0.613 0.038 0.028 0.005 -0.381 0.095 0.350
## 26 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 27 0.000 0.546 0.397 0.000 0.031 0.005 0.072 0.188 0.125
## 28 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 29 0.209 0.665 0.516 0.040 0.028 0.005 -0.233 0.194 0.332
## 30 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 31 0.000 0.601 0.373 0.000 0.030 0.005 0.034 0.289 0.323
## 32 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 33 0.104 0.603 0.490 0.038 0.030 0.005 -0.117 0.143 0.662
## 34 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 35 0.904 0.959 0.912 0.017 0.009 0.003 -1.550 0.194 0.417
## 36 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 37 0.982 0.992 0.981 0.007 0.004 0.002 -2.245 0.171 0.733
## 38 0.366 0.713 0.627 0.038 0.027 0.005 -0.443 0.119 0.440
## 39 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 40 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 41 0.640 0.848 0.738 0.033 0.019 0.005 -0.832 0.194 0.555
## 42 0.278 0.678 0.576 0.039 0.028 0.005 -0.326 0.135 0.096
## 43 0.249 0.647 0.590 0.038 0.029 0.005 -0.302 0.074 0.265
## 44 0.621 0.838 0.734 0.034 0.020 0.005 -0.805 0.179 0.279
## 45 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 46 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 47 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 48 0.480 0.752 0.714 0.037 0.026 0.005 -0.623 0.057 0.201
## 49 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 50 0.058 0.595 0.455 0.038 0.030 0.005 -0.064 0.177 0.896
## 51 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 52 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 53 0.609 0.823 0.756 0.034 0.022 0.005 -0.810 0.116 0.760
## 54 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 55 0.542 0.804 0.688 0.036 0.022 0.005 -0.674 0.183 0.975
## 56 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 57 0.987 0.993 0.998 0.030 0.019 0.002 -2.632 -0.189 0.248
## 58 0.168 0.608 0.551 0.037 0.030 0.005 -0.201 0.072 0.469
## 59 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 60 0.936 0.952 0.999 0.128 0.103 0.001 -2.324 -0.656 0.244
## 61 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 62 0.936 0.952 0.999 0.128 0.103 0.001 -2.324 -0.656 0.244
## 63 0.968 0.981 0.994 0.018 0.012 0.001 -2.303 -0.219 0.821
## 64 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 65 0.944 0.973 0.969 0.016 0.009 0.002 -1.890 0.030 0.961
## 66 0.947 0.963 0.998 0.069 0.053 0.001 -2.302 -0.516 0.876
## 67 0.943 0.972 0.969 0.016 0.009 0.002 -1.887 0.027 0.964
## 68 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 69 0.908 0.947 0.976 0.025 0.017 0.002 -1.796 -0.175 0.963
## 70 0.965 0.987 0.940 0.009 0.004 0.003 -1.889 0.337 0.419
## 71 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 72 0.138 0.661 0.450 0.041 0.028 0.005 -0.145 0.270 0.554
## 73 0.184 0.669 0.483 0.041 0.028 0.005 -0.198 0.239 0.768
## 74 0.046 0.632 0.404 0.042 0.029 0.005 -0.047 0.289 0.495
## 75 0.425 0.703 0.731 0.038 0.028 0.005 -0.575 -0.041 0.241
## 76 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 77 0.000 0.665 0.255 0.000 0.031 0.005 0.116 0.543 0.494
## 78 0.046 0.558 0.484 0.036 0.030 0.005 -0.053 0.093 0.435
## 79 0.000 0.574 0.325 0.000 0.031 0.005 0.134 0.321 0.683
## 80 0.000 0.574 0.325 0.000 0.031 0.005 0.134 0.321 0.683
## 81 0.000 0.574 0.325 0.000 0.031 0.005 0.134 0.321 0.683
## 82 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 83 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
⭐ The data are quite good so we can continue with our analysis
sessionInfo()## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel grid stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] emoji_0.2.0 vsn_3.64.0
## [3] tximport_1.24.0 forcats_0.5.2
## [5] stringr_1.4.1 dplyr_1.0.10
## [7] purrr_0.3.5 readr_2.1.3
## [9] tidyr_1.2.1 tibble_3.1.8
## [11] tidyverse_1.3.2 pvclust_2.2-0
## [13] plotly_4.10.0 magrittr_2.0.3
## [15] hyperSpec_0.100.0 xml2_1.3.3
## [17] ggplot2_3.3.6 lattice_0.20-45
## [19] here_1.0.1 gplots_3.1.3
## [21] DESeq2_1.36.0 SummarizedExperiment_1.26.1
## [23] Biobase_2.56.0 MatrixGenerics_1.8.1
## [25] matrixStats_0.62.0 GenomicRanges_1.48.0
## [27] GenomeInfoDb_1.32.4 IRanges_2.30.1
## [29] S4Vectors_0.34.0 BiocGenerics_0.42.0
## [31] data.table_1.14.4
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.1 backports_1.4.1 lazyeval_0.2.2
## [4] splines_4.2.0 crosstalk_1.2.0 BiocParallel_1.30.4
## [7] digest_0.6.30 htmltools_0.5.3 fansi_1.0.3
## [10] memoise_2.0.1 googlesheets4_1.0.1 limma_3.52.4
## [13] tzdb_0.3.0 Biostrings_2.64.1 annotate_1.74.0
## [16] modelr_0.1.9 vroom_1.6.0 jpeg_0.1-9
## [19] colorspace_2.0-3 blob_1.2.3 rvest_1.0.3
## [22] haven_2.5.1 xfun_0.34 hexbin_1.28.2
## [25] crayon_1.5.2 RCurl_1.98-1.9 jsonlite_1.8.3
## [28] genefilter_1.78.0 survival_3.4-0 glue_1.6.2
## [31] gtable_0.3.1 gargle_1.2.1 zlibbioc_1.42.0
## [34] XVector_0.36.0 DelayedArray_0.22.0 scales_1.2.1
## [37] DBI_1.1.3 Rcpp_1.0.9 viridisLite_0.4.1
## [40] xtable_1.8-4 bit_4.0.4 preprocessCore_1.58.0
## [43] htmlwidgets_1.5.4 httr_1.4.4 RColorBrewer_1.1-3
## [46] ellipsis_0.3.2 farver_2.1.1 pkgconfig_2.0.3
## [49] XML_3.99-0.11 sass_0.4.2 dbplyr_2.2.1
## [52] deldir_1.0-6 locfit_1.5-9.6 utf8_1.2.2
## [55] labeling_0.4.2 tidyselect_1.2.0 rlang_1.0.6
## [58] AnnotationDbi_1.58.0 munsell_0.5.0 cellranger_1.1.0
## [61] tools_4.2.0 cachem_1.0.6 cli_3.4.1
## [64] generics_0.1.3 RSQLite_2.2.18 broom_1.0.1
## [67] evaluate_0.17 fastmap_1.1.0 yaml_2.3.6
## [70] knitr_1.40 bit64_4.0.5 fs_1.5.2
## [73] caTools_1.18.2 KEGGREST_1.36.3 brio_1.1.3
## [76] compiler_4.2.0 rstudioapi_0.14 png_0.1-7
## [79] testthat_3.1.5 affyio_1.66.0 reprex_2.0.2
## [82] geneplotter_1.74.0 bslib_0.4.0 stringi_1.7.8
## [85] highr_0.9 Matrix_1.5-1 vctrs_0.5.0
## [88] pillar_1.8.1 lifecycle_1.0.3 BiocManager_1.30.19
## [91] jquerylib_0.1.4 bitops_1.0-7 R6_2.5.1
## [94] latticeExtra_0.6-30 affy_1.74.0 KernSmooth_2.23-20
## [97] codetools_0.2-18 gtools_3.9.3 assertthat_0.2.1
## [100] rprojroot_2.0.3 withr_2.5.0 GenomeInfoDbData_1.2.8
## [103] hms_1.1.2 rmarkdown_2.17 googledrive_2.0.0
## [106] lubridate_1.8.0 interp_1.1-3